pandas, numpy for data manipulation.
matplotlib, seaborn for data visualisation.
os for path setting.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import seaborn as sns
import warnings
# Working directory that contains the KPMG raw-data workbook.
# NOTE(review): hard-coded local Windows path — must be edited per machine.
data_dir ='C:\\Users\\Keerthi\\Desktop\\04-Keerthi\\KPMG'
os.chdir(data_dir)
As the first row of each sheet is a title row, the data is imported with pandas using header=1 (column names taken from the second row).
# Load the three sheets; header=1 skips the title row of each sheet.
transactions = pd.read_excel("KPMG_VI_New_raw_data_update_final.xlsx", header=1, sheet_name='Transactions')
customer_demographic = pd.read_excel("KPMG_VI_New_raw_data_update_final.xlsx", header=1, sheet_name='CustomerDemographic')
customer_address = pd.read_excel("KPMG_VI_New_raw_data_update_final.xlsx", header=1, sheet_name='CustomerAddress')
# DataFrame.info() prints to stdout and returns None, so it must be called on
# its own line — passing it to print() appends a spurious "None" to the output.
print(' The shape of Transactions is ', transactions.shape)
print('Customer Transactions Information:-')
transactions.info()
print(' The shape of Customer Demographics is ', customer_demographic.shape)
print('Customer Demographics Information:-')
customer_demographic.info()
print(' The shape of Customer Address is ', customer_address.shape)
print('Customer Address Information:-')
customer_address.info()
* Columns present in new_customer_lists are combination of Customer Demographic with the Customer Address but Customer Id is missing
* Unnamed columns in new_customer_lists can be dropped
# Ids present in exactly one of the two tables (empty set => same id coverage).
set(customer_demographic.customer_id).symmetric_difference(customer_address.customer_id)
# Left join keeps every demographic row even when the address is missing.
customer = pd.merge(customer_demographic, customer_address, how='left', on='customer_id')
customer.head()
# Inspect 'default' before dropping it; its values are not used downstream.
customer['default'].value_counts()
drop_col=['default']
customer = customer.drop(drop_col, axis=1)
For achieving better results from the applied model in Machine Learning projects the format of the data has to be in a proper manner.
We will have to check for the following
(i) Missing values analysis
(ii) Outliers analysis
(iii) Columns Analysis
# Null counts per column, plus a heatmap of the missingness pattern.
print(customer.isnull().sum())
sns.heatmap(customer.isnull(), cbar=False,cmap='rainbow')
Observations :
* DOB will be converted to age and handled.
* Rank, Value columns can be dropped
* Update last_name,Job_title and job_industry_category with 'unknown'
* Drop rows with NA values for address , country
* Update tenure with mean value.
# Impute missing categorical fields with an explicit 'Unknown' category.
# Assignment is used instead of column-level fillna(..., inplace=True), which
# is deprecated (and a silent no-op under pandas copy-on-write).
customer['job_title'] = customer['job_title'].fillna('Unknown')
customer['job_industry_category'] = customer['job_industry_category'].fillna('Unknown')
customer['last_name'] = customer['last_name'].fillna('Unknown')
Check the remaining missing values.
# tenure is numeric, so impute missing values with the column mean.
# Assignment form avoids the deprecated column-level fillna(..., inplace=True).
customer['tenure'] = customer['tenure'].fillna(customer['tenure'].mean())
def age(date):
    """Return the age in whole calendar years (current year minus birth year)."""
    birth_year = pd.to_datetime(date).year
    return pd.Timestamp('today').year - birth_year
# Derive age from DOB, then drop the raw date column.
customer['age'] = customer['DOB'].apply(age)
customer= customer.drop('DOB',axis=1)
# Filling Null values with mode
customer['age'].fillna((customer['age'].mode()[0]), inplace=True)
# Inspect the implausible 177-year-old record before it is removed below.
customer.loc[(customer["age"] == 177)]
We can observe one outlier record with an age of 177 years; this row will be dropped.
# Remove the 177-year-old outlier row.
customer = customer[customer.age != 177]
print('Number of rows in Customer -->',customer.shape)
customer['gender'].value_counts()
# Drop remaining rows with nulls (address fields missing after the left join).
# NOTE(review): dropna() with no subset drops rows with a null in *any*
# column — confirm this matches the plan of only dropping address/country nulls.
customer=customer.dropna()
print(customer.isnull().sum())
# Box plots of the numeric columns to spot outliers.
plt.figure(figsize=(20, 10))
numeric_cols = ['past_3_years_bike_related_purchases', 'property_valuation', 'tenure', 'age']
# The original reused the name `col` for both the column list and the loop
# variable, and iterated X[col] (which walks a DataFrame's column labels);
# iterating the list directly is the intended behavior.
for i, col in enumerate(numeric_cols):
    plt.subplot(3, 4, i + 1)
    # Keyword x= : positional column data is deprecated in seaborn >= 0.13.
    sns.boxplot(x=customer[col])
    plt.title(col)
customer[customer.duplicated()].sum()
summary statistics can be created for pandas dataframe using describe method
# Summary statistics for the numeric columns.
print(customer.describe())
# Bucket ages into decade bins for group-level analysis.
customer['age_group'] = pd.cut(x=customer['age'], bins=[0,10,20,30,40,50,60,70,80,90], labels=['0-10','10-20','20-30',\
'30-40','40-50','50-60','60-70','70-80','80-90'], right=True)
fig,ax=plt.subplots(figsize=(7,5))
sns.countplot(x ='age_group',data=customer)
plt.title('Age Group Distribution of Present Customer')
plt.show()
customer['state'].value_counts()
# Normalise full state names to their abbreviations so the categories merge.
customer['state'].replace({'New South Wales': 'NSW', 'Victoria': 'VIC'}, inplace=True)
customer['state'].value_counts()
plt.figure(figsize=(5,4))
# Keyword x= : seaborn >= 0.13 removed positional column names (this also
# matches the countplot(x=...) usage elsewhere in this file).
ax = sns.countplot(x="state", data=customer, order=customer['state'].value_counts().index)
plt.xticks(rotation=90)
for p in ax.patches:
    # Label each bar with its count just above the bar.
    ax.annotate(format(p.get_height()), (p.get_x() + 0.35, p.get_height() + 1))
plt.show()
# Distributions of country and of the deceased indicator.
customer['country'].value_counts()
customer['deceased_indicator'].value_counts()
As there are only 2 rows in the 'Y' category, it doesn't provide much insight. Dropping the column.
# deceased_indicator is dropped: its value_counts above shows it is nearly
# single-valued, so it adds no insight.
customer.drop(columns=['deceased_indicator'], inplace = True)
customer['gender'].value_counts()
# Normalise the gender spellings into Male / Female / Unknown.
customer['gender'].replace({'M': 'Male', 'Femal':'Female','F':"Female",'U':"Unknown"}, inplace=True)
customer['gender'].value_counts()
# Count plots of the customer categoricals, each annotated with bar counts.
# Keyword x= throughout: seaborn >= 0.13 removed positional column names
# (consistent with the countplot(x=...) usage elsewhere in this file).
plt.figure(figsize=(5,4))
ax = sns.countplot(x="gender", data=customer, order=customer['gender'].value_counts().index)
plt.xticks(rotation=90)
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + 0.35, p.get_height() + 1))
plt.show()
customer['job_industry_category'].value_counts()
plt.figure(figsize=(5,4))
ax = sns.countplot(x="job_industry_category", data=customer, order=customer['job_industry_category'].value_counts().index)
plt.xticks(rotation=100)
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + 0.35, p.get_height() + 1))
plt.show()
customer['job_title'].value_counts()
customer['owns_car'].value_counts()
plt.figure(figsize=(5,4))
ax = sns.countplot(x="owns_car", data=customer, order=customer['owns_car'].value_counts().index)
plt.xticks(rotation=90)
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + 0.35, p.get_height() + 1))
plt.show()
customer['wealth_segment'].value_counts()
plt.figure(figsize=(5,4))
ax = sns.countplot(x="wealth_segment", data=customer, order=customer['wealth_segment'].value_counts().index)
plt.xticks(rotation=90)
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + 0.35, p.get_height() + 1))
plt.show()
# Correlation heatmap of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where corr() no longer
# silently drops non-numeric columns.
customer_cor = customer.corr(numeric_only=True)
sns.heatmap(customer_cor, annot=True, cmap='rainbow')
We can observe that property_valuation and postcode are negatively correlated.
Tenure and age are positively correlated.
# Quick structural look at the transactions table.
print(' The shape of column is ',transactions.shape)
transactions.info()
transactions.head()
For achieving better results from the applied model in Machine Learning projects the format of the data has to be in a proper manner.
We will have to check for the following
(i) Missing values analysis
(ii) Outliers analysis
(iii) Columns Analysis
# Null counts and missingness heatmap for transactions.
transactions.isnull().sum()
sns.heatmap(transactions.isnull(), cbar=False,cmap='rainbow')
Null counts: online_order 360; brand, product_line, product_class, product_size, standard_cost and product_first_sold_date 197 each.
# Rows where the transaction itself is unusable are dropped outright...
transactions.dropna(subset=['online_order'], inplace=True)
transactions.dropna(subset=['standard_cost'], inplace=True)
transactions.dropna(subset=['product_first_sold_date'], inplace=True)
# ...while missing product descriptors are kept as an 'Unknown' category.
# Assignment form replaces the deprecated column-level fillna(..., inplace=True)
# (a silent no-op under pandas copy-on-write).
transactions['brand'] = transactions['brand'].fillna('Unknown')
transactions['product_line'] = transactions['product_line'].fillna('Unknown')
transactions['product_size'] = transactions['product_size'].fillna('Unknown')
sns.heatmap(transactions.isnull(), cbar=False, cmap='rainbow')
# Box plots of the numeric transaction columns to spot outliers.
plt.figure(figsize=(20, 10))
numeric_cols = ['list_price', 'standard_cost', 'customer_id']
# The original reused the name `col` for both the column list and the loop
# variable and iterated X[col] (a DataFrame, which yields column labels);
# iterate the list directly instead.
for i, col in enumerate(numeric_cols):
    plt.subplot(3, 4, i + 1)
    # Keyword x= : positional column data is deprecated in seaborn >= 0.13.
    sns.boxplot(x=transactions[col])
    plt.title(col)
transactions[transactions.duplicated()].sum()
summary statistics can be created for pandas dataframe using describe method
print(transactions.describe())
We can observe that customer_id has an outlier value of 5034.
transactions['customer_id'].max()
We can see that this customer_id doesn't exist in the customer table, so the row will be dropped.
# customer_id 5034 has no match in the customer table, so drop that row.
transactions = transactions[transactions.customer_id != 5034]
transactions['order_status'].value_counts()
plt.figure(figsize=(7,4))
# Keyword x= : seaborn >= 0.13 removed positional column names.
ax = sns.countplot(x="order_status", data=transactions, order=transactions['order_status'].value_counts().index)
plt.xticks(rotation=90)
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + 0.15, p.get_height() + 1))
plt.show()
# Count plots of the transaction categoricals, each annotated with bar counts.
# Keyword x= throughout: seaborn >= 0.13 removed positional column names.
transactions['brand'].value_counts()
plt.figure(figsize=(7,4))
ax = sns.countplot(x="brand", data=transactions, order=transactions['brand'].value_counts().index)
plt.xticks(rotation=90)
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + 0.15, p.get_height() + 1))
plt.show()
transactions['product_line'].value_counts()
plt.figure(figsize=(7,4))
ax = sns.countplot(x="product_line", data=transactions, order=transactions['product_line'].value_counts().index)
plt.xticks(rotation=90)
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + 0.15, p.get_height() + 1))
plt.show()
transactions['product_class'].value_counts()
plt.figure(figsize=(7,4))
ax = sns.countplot(x="product_class", data=transactions, order=transactions['product_class'].value_counts().index)
plt.xticks(rotation=90)
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + 0.15, p.get_height() + 1))
plt.show()
transactions['product_size'].value_counts()
plt.figure(figsize=(7,4))
ax = sns.countplot(x="product_size", data=transactions, order=transactions['product_size'].value_counts().index)
plt.xticks(rotation=90)
for p in ax.patches:
    ax.annotate(format(p.get_height()), (p.get_x() + 0.15, p.get_height() + 1))
plt.show()
# Convert product_first_sold_date to datetime, assuming Unix epoch seconds.
# NOTE(review): if the raw column is actually an Excel serial day number,
# unit='s' yields dates in 1970 — confirm against the source workbook.
transactions['product_first_sold_date'] = pd.to_datetime(transactions['product_first_sold_date'], unit='s')
transactions['product_first_sold_date'].head()
# Correlation heatmap of the numeric transaction columns.
# numeric_only=True is required on pandas >= 2.0, where corr() no longer
# silently drops non-numeric columns.
transactions_corr = transactions.corr(numeric_only=True)
sns.heatmap(transactions_corr, annot=True, cmap='rainbow')
We can observe that list_price and standard_cost are positively correlated. They can be used to calculate profit.
# Load the prospect list; header=1 skips the title row as with the other sheets.
new_customer_lists = pd.read_excel("KPMG_VI_New_raw_data_update_final.xlsx", header=1, sheet_name='NewCustomerList')
new_customer_lists
# Element-wise membership test of first/last names against the prospect list.
# NOTE(review): DataFrame.isin compares values per column, not name *pairs* —
# a True/True row only means each name appears somewhere in its own column.
# Confirm this matches the intended "customer appears in both lists" check.
Df= customer[['first_name','last_name']].isin(new_customer_lists[['first_name','last_name']])
Df.head()
Df.loc[(Df["first_name"] == True) & (Df["last_name"] == True)]
* The customers present in customer_demographic and new_customer_lists are entirely different.
* Adding Customer_id with serial number starting from 4004, As 4003 was the last customer_id which is used in customer_demographic
# Assign sequential customer_ids continuing after the last existing id (4003).
new_customer_lists.insert(0, 'customer_id', range(4004, 4004 + len(new_customer_lists)))
new_customer_lists.head()
print('shape of data -->',new_customer_lists.shape)
print('\nColumns of New Customer data\n -->',new_customer_lists.columns)
new_customer_lists.dtypes
For achieving better results from the applied model in Machine Learning projects the format of the data has to be in a proper manner.
We will have to check for the following
(i) Missing values analysis
(ii) Outliers analysis
(iii) Columns Analysis
# Null counts and missingness heatmap for the prospect list.
print(new_customer_lists.isnull().sum())
sns.heatmap(new_customer_lists.isnull(), cbar=False,cmap='rainbow')
Observations :
* DOB will be converted to age and mode value is imputed.
* Rank, Value columns will be dropped
* Update last_name,Job_title and job_industry_category with 'unknown'
* Drop rows with NA values for address , country
* Update tenure with mean value.
# Rank/Value and the trailing unnamed columns carry no demographic signal.
drop_col = ['Rank','Value','Unnamed: 16' ,'Unnamed: 17','Unnamed: 18','Unnamed: 19','Unnamed: 20']
new_customer_lists = new_customer_lists.drop(drop_col, axis=1)
# Impute missing categoricals with 'Unknown'. Assignment form replaces the
# deprecated column-level fillna(..., inplace=True).
new_customer_lists['job_title'] = new_customer_lists['job_title'].fillna('Unknown')
new_customer_lists['job_industry_category'] = new_customer_lists['job_industry_category'].fillna('Unknown')
new_customer_lists['last_name'] = new_customer_lists['last_name'].fillna('Unknown')
def age(date):
    """Age in whole calendar years: current year minus the birth year.

    Note: this redefines the identical helper declared earlier in the file.
    """
    return pd.Timestamp('today').year - pd.to_datetime(date).year
# Derive age from DOB, then drop the raw date column.
new_customer_lists['age'] = new_customer_lists['DOB'].apply(age)
new_customer_lists= new_customer_lists.drop('DOB',axis=1)
# Filling Null values with mode
new_customer_lists['age'].fillna((new_customer_lists['age'].mode()[0]), inplace=True)
# Combine each transaction with its customer's demographics/address.
data = transactions.merge(customer, how='inner', on='customer_id')
print('Shape of the Dataset-->',data.shape)
print ('Columns names -->', (data.columns))
print ('Columns Datatype -->', data.dtypes)
# Per-transaction profit. NOTE(review): this column is added to `transactions`
# *after* `data` was built above, so `data` itself has no 'profit' column.
transactions['profit'] = transactions['list_price']-transactions['standard_cost']
def annot_plot(ax, w, h):
    """Hide the top/right spines of *ax* and label every bar with its height.

    w, h: horizontal/vertical offsets of the label from the bar's top-left corner.
    """
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    for patch in ax.patches:
        height = patch.get_height()
        ax.annotate(f'{height}', (patch.get_x() + w, height + h))
(i)Brand Analysis
(ii) Gender Analysis
(iii) Geographic Analysis
(iv) Age Group Analysis
# Brand-level breakdowns of the merged transaction data.
ax=pd.crosstab(data.brand, data.online_order).plot(kind='bar', stacked=False)
plt.xticks(rotation=105)
plt.title('Online Orders in each Brand')
plt.show()
#plt.figure(figsize=(20,20))
pd.crosstab(data.brand, data.product_size).plot(kind='bar', stacked=True)
plt.xticks(rotation=105)
# Annotate each brand with its total transaction count above the stack.
for i, v in enumerate(data.groupby('brand') \
.agg({'product_size': lambda x: len(x)})['product_size']):
plt.text(i - .25, v + 10, int(v), color='black')
#axes = plt.gca()
plt.title('Product Size in each Brand')
plt.show()
pd.crosstab(data.brand, data.product_class).plot(kind='bar', stacked=True)
plt.xticks(rotation=105)
plt.title('Product Class in each Brand')
# Same total-count annotation for the product-class view.
for i, v in enumerate(data.groupby('brand') \
.agg({'product_class': lambda x: len(x)})['product_class']):
plt.text(i - .25, v + 10, int(v), color='black')
#axes = plt.gca()
plt.title('Product Class in each Brand')
plt.show()
pd.crosstab(data.brand, data.product_line).plot(kind='bar', stacked=True)
plt.xticks(rotation=105)
plt.title('Product Line in each Brand')
plt.show()
Highest selling brand : Solex
Highest Selling Product size : Medium
Highest Selling Product Class : Medium
Highest Selling Product line : Standard
# Gender-level analysis: purchases plus brand/industry/wealth/age breakdowns.
Customer_state = customer.groupby("gender").past_3_years_bike_related_purchases.sum()
Customer_state.plot(kind='bar')
plt.title('Number of transactions per gender')
pd.crosstab(data.brand, data.gender).plot(kind='bar', stacked=False)
plt.title('Number of Brands used per gender')
pd.crosstab(data.job_industry_category, data.gender).plot(kind='bar', stacked=False)
plt.title('Bicycles used in each industry per gender')
plt.show()
pd.crosstab(data.wealth_segment, data.gender).plot(kind='bar', stacked=False)
plt.title('Bicycles used in each Wealth segment per gender')
plt.show()
pd.crosstab(data.age_group, data.gender).plot(kind='bar', stacked=False)
plt.title('Bicycles used in each age group segment per gender')
plt.show()
Largest buyer : Female
Highest Selling Brand by Females : Solex
Highest Selling in Industry category by Females : Manufacturing
Highest Selling in wealth segment by females : Mass Customers
Highest selling in Age group by Females : 40-50
# Share of transactions per state as a pie chart.
x = data['state']
x = x.value_counts(normalize=True)*100
x=x.reset_index()
# NOTE(review): reset_index() names the label column 'index' only on older
# pandas; pandas >= 2.0 names it 'state' — confirm the pandas version in use.
labels = x['index']
explode = [0.0,0.0,0.1];
fig,ax = plt.subplots(figsize=(5,5))
plt.pie(x['state'], explode=explode, labels = labels,autopct='%1.1f%%', shadow=True, startangle=0)
plt.axis('equal')
plt.title('Number of transactions in each State')
plt.show()
import plotly.express as px
# Bubble plot: purchase volume by state and gender.
px.scatter(data_frame=data, x='state', y='gender', size='past_3_years_bike_related_purchases')
# State-level analysis: gender, wealth, brand, class, line and size breakdowns.
pd.crosstab(data.state, data.gender).plot(kind='bar', stacked=False)
plt.title('Bicycles used in each state per gender')
plt.show()
pd.crosstab(data.state, data.wealth_segment).plot(kind='bar', stacked=False)
plt.title('Bicycles used in each Wealth segment per state')
plt.show()
pd.crosstab(data.state, data.brand).plot(kind='bar', stacked=False)
plt.title('Bicycles used in each state per brand')
plt.show()
pd.crosstab(data.state, data.product_class).plot(kind='bar', stacked=False)
plt.title('Bicycles used in each product Class per gender')
plt.show()
pd.crosstab(data.state, data.product_line).plot(kind='bar', stacked=False)
plt.title('Bicycles used in each Product line per state')
plt.show()
pd.crosstab(data.state, data.product_size).plot(kind='bar', stacked=False)
plt.title('Bicycles used in each product size per state')
plt.show()
# Total past-3-years purchases per state.
Customer_state = customer.groupby("state").past_3_years_bike_related_purchases.sum()
Customer_state.plot(kind='bar')
plt.title('Number of transactions per state')
pd.crosstab(data['state'],data['owns_car']).plot(kind='bar')
plt.title('Bicycles used still they own car per state')
plt.show()
Highest selling states : NSW
Gender contribution : Female
wealth segment contribution : Mass customers
Brand contribution :Solex
Car ownership highest in : NSW
# Age-group level analysis.
sns.barplot(x = "age_group",y= "past_3_years_bike_related_purchases",data = customer)
plt.title('Bicycles related purchases in 3-years in each age group')
plt.show()
sns.countplot(x = "age_group",data = data,hue="gender")
plt.title('Bicycles used in each age group')
plt.show()
# Average list price per industry, split by gender.
plt.figure(figsize=(11,8))
sns.barplot(x=data.job_industry_category,y=data.list_price,hue=data.gender)
We can see that the 80-90 age group has the highest bike-related purchases
Highest sales contributed from age group 40-50
# Sales count per (product_line, product_size) combination.
# groupby(...).size() replaces the original agg(len), which is fragile/broken
# on modern pandas when every column belongs to the group key.
product = data[['product_line', 'product_size']]
product = product.groupby(['product_line', 'product_size']).size().reset_index(name='Count')
product['Product Category'] = product['product_line'].str.cat(product['product_size'], sep=",")
product = product[['Product Category', 'Count']]
val = product['Count']
fig, ax = plt.subplots(figsize=(8, 8))
sns.barplot(x='Count', y='Product Category', data=product, ax=ax)
plt.title('Product Category Most sold by company')
for i in range(len(val)):
    # Print the count just past the end of each horizontal bar.
    plt.text(x=val[i] + 0.7, y=i, s=val[i], size=12)
Here we can see Standard,medium are having highest sales.
Followed by standard,large
# Monthly profit trend across the year.
# .copy() so the month column is added to an independent frame rather than a
# view of `transactions` (avoids SettingWithCopyWarning / lost writes).
year_profit = transactions[['transaction_date', 'profit']].copy()
year_profit['month'] = pd.DatetimeIndex(year_profit['transaction_date']).month
# Sum only 'profit': the original agg(sum) also tried to sum the datetime
# column, which fails on modern pandas.
month_profit = year_profit.groupby('month', as_index=False)['profit'].sum()
fig, ax = plt.subplots(figsize=(8, 8))
month_profit.plot(x='month', y='profit', kind='line', ax=ax, grid=True)
plt.ylabel('Profits')
plt.xticks([2, 4, 6, 8, 10, 12], ['Feb', 'Apr', 'June', 'Aug', 'Oct', 'Dec'])
plt.show()
# Persist both cleaned datasets into a single workbook, one sheet each.
with pd.ExcelWriter('KPMG_VI_New_raw_data_cleaned.xlsx') as writer:
    data.to_excel(writer, sheet_name='OldCustomers')
    new_customer_lists.to_excel(writer, sheet_name='NewCustomers')
# Inspect the prospect list before clustering.
new_customer_lists.columns
new_customer_lists.describe()
# Feature matrix for clustering: columns 16 and 4 selected by position.
# NOTE(review): positional iloc is fragile after the column drops above —
# presumably these are age and past_3_years_bike_related_purchases (see the
# axis labels below); verify the indices against new_customer_lists.columns.
x= new_customer_lists.iloc[:,[16,4]].values
x
from sklearn.cluster import KMeans
# Elbow method: within-cluster sum of squares (inertia) for k = 1..7.
wss = []
for k in range(1, 8):
    model = KMeans(n_clusters=k, init='k-means++', random_state=23)
    model.fit(x)
    wss.append(model.inertia_)
plt.plot(range(1, 8), wss)
plt.title('Elbow Method to find K')
plt.xlabel('Number of Clusters')
plt.ylabel('WSS')
plt.show()
As the number of clusters increases, the WCSS decreases. From the elbow plot, the optimal number of clusters is 4.
# Final model with k = 4 (chosen from the elbow plot above).
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=23)
pred_kmeans = kmeans.fit_predict(x)
# One scatter per cluster label, then the centroids on top.
plt.scatter(x[pred_kmeans == 0, 0], x[pred_kmeans == 0, 1], s=10, c='red', label='Cluster1')
plt.scatter(x[pred_kmeans == 1, 0], x[pred_kmeans == 1, 1], s=10, c='teal', label='Cluster2')
plt.scatter(x[pred_kmeans == 2, 0], x[pred_kmeans == 2, 1], s=10, c='orange', label='Cluster3')
plt.scatter(x[pred_kmeans == 3, 0], x[pred_kmeans == 3, 1], s=10, c='green', label='Cluster4')
# Legend label fixed: 'Centroid' was misspelled 'Centriod'.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='purple', label='Centroid')
plt.title('Clusters of Customers')
plt.xlabel('Age')
plt.ylabel('Bike related purchases')
plt.legend()
plt.show()
# Second clustering run on a different feature pair (columns 12 and 4 by position).
x= new_customer_lists.iloc[:,[12,4]].values
from sklearn.cluster import KMeans
# Elbow method again: inertia for k = 1..7.
wss = []
for k in range(1, 8):
    wss.append(KMeans(n_clusters=k, init='k-means++', random_state=23).fit(x).inertia_)
plt.plot(range(1, 8), wss)
plt.title('Elbow Method to find K')
plt.xlabel('Number of Clusters')
plt.ylabel('WSS')
plt.show()
As the number of clusters increases, the value of wcss decreases. The optimal number of clusters is 3.
# Final model with k = 3 (chosen from the elbow plot above).
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=23)
pred_kmeans = kmeans.fit_predict(x)
# With n_clusters=3 the labels are 0..2; the original also plotted a
# 'Cluster4' series for label 3, which can never occur and only cluttered
# the legend, so it is removed. Legend typo 'Centriod' fixed as well.
plt.scatter(x[pred_kmeans == 0, 0], x[pred_kmeans == 0, 1], s=10, c='red', label='Cluster1')
plt.scatter(x[pred_kmeans == 1, 0], x[pred_kmeans == 1, 1], s=10, c='teal', label='Cluster2')
plt.scatter(x[pred_kmeans == 2, 0], x[pred_kmeans == 2, 1], s=10, c='orange', label='Cluster3')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=100, c='purple', label='Centroid')